* Start from an empty session
clear all
* Do not pause for --more-- prompts while the script runs
set more off
* NOTE(review): Stata 12 and later manage memory automatically, so this
* setting is presumably ignored under version 13 - confirm before removing
set mem 10000000
* Raise the maximum matrix dimension
set matsize 10000
* Interpret the commands that follow with Stata 13 syntax
version 13

****************************************************************** 
*** Build File to Process RGGVY Scraped Microdata ****************
****************************************************************** 

** Set file paths
do "$path_code/paths.do"

********************************************************************************
********************************************************************************

** RGGVY program data 

/* NOTE: these data come from an extensive webscraping script, and have 
   already been cleaned and formatted into a Stata dataset. This script
   cleans the name fields for use in the fuzzy habitation merge algorithm.
   We do not use these microdata in our main analysis, as they are not 
   very reliable.
*/

{
* Load the scraped RGGVY master file and discard the fields that are not
* needed for name matching
use "$rggvy/rggvy_combined_master.dta", clear
drop coverage_cat-bk_nt_com vi_code

* Flag records that share a state/district/block/village code combination
duplicates tag st_code dt_code bk_code vi_code2, generate(dup_sdv_rggvy)

* Village names: strip mis-encoded character sequences one at a time, in the
* same order as before (the first sequence is deliberately applied again last)
foreach junk in "æ¼ã¹¦" "æ¼ã¹¥" "æ¼ã¸°" "æ¼ã¹¦" {
    replace village = trim(subinstr(village,"`junk'","",.))
}
replace village = upper(trim(itrim(village)))

* Block names: normalise punctuation first ...
replace block = subinstr(block,"*","",.)
replace block = subinstr(block," -"," ",.)
replace block = subinstr(block,"- "," ",.)
replace block = subinstr(block,"( ","(",.)
replace block = subinstr(block," )",")",.)
replace block = subinstr(block,"-I"," I",.)
replace block = subinstr(block,"-1"," I",.)
* ... then delete administrative suffixes, one token after another
foreach token in "(T)" "(S.T)" "(M)" "C.D.BLOCK" "SUB-DIV." "SUB-DIVISION" "(P)" "CIRCLE" "CD BLOCK" {
    replace block = subinstr(block,"`token'","",.)
}
replace block = upper(trim(itrim(block)))

* District names: remove asterisks, collapse whitespace, upper-case
replace district = subinstr(district,"*","",.)
replace district = upper(trim(itrim(district)))

* Among duplicated code combinations, discard records that have no DPR code
* and are missing either of the two village ids
drop if dup>0 & dpr_code=="" & (v_id_cov==. | v_id_com==.)

* Re-tag what is left and discard remaining duplicates lacking v_id_cov
duplicates tag st_code dt_code bk_code vi_code2, generate(dup_sdv_rggvy2)
drop if dup_sdv_rggvy2>0 & v_id_cov==.
duplicates report st_code dt_code bk_code vi_code2
drop v_id_cov v_id_com dup*

* Give the name and code fields their RGGVY-specific names
rename (bk_code vi_code2 district block village) (bk_code_rggvy vi_code district_rggvy block_rggvy village_rggvy)

destring dpr_code, replace
compress
duplicates drop
save "$rggvy/rggvy_master_names.dta", replace
}


********************************************************************************
********************************************************************************

** Prep RGGVY habitation list, in order to merge in names

/* NOTE: these data come from lists of covered habitations published on 
   the RGGVY website. Unfortunately, they are not very consistent with the
   2003/2009 habitation census, and hence we do not rely on them in our 
   analysis. This script cleans and formats them into a Stata dataset, which 
   we use in the fuzzy habitation merge algorithm.
*/
{
* Read xls files into stata
{
* Import every .xls workbook in the loh folder and stack all sheets into a
* single dataset, rggvy_list_of_habs.dta, saved in that same folder.
* Each workbook is expected to hold a sheet named after the file (upper-cased,
* extension removed) and optionally sheets with the suffixes 1 to 4.
clear
cd "$loh"

* Remove any accumulator left over from a previous run, so that the result
* never contains stale rows and does not depend on which workbook comes first
capture erase "rggvy_list_of_habs.dta"

local myfilelist : dir "." files  "*.xls"
foreach file of local myfilelist{
 clear
 local sheet_name = upper(substr("`file'",1,length("`file'")-4))
 
 * Main sheet. Errors are shown but do not stop the loop; a failed import
 * skips the append and save for that sheet.
 capture noisily{
   import excel using "`file'", firstrow sheet("`sheet_name'")
   * Append to the accumulator only once it exists
   capture confirm file "rggvy_list_of_habs.dta"
   if _rc==0{
     append using rggvy_list_of_habs
   }
   save rggvy_list_of_habs, replace
 }
 
 * Numbered continuation sheets; most workbooks will not have all of them,
 * so a failed import here is simply skipped
 foreach i in "1" "2" "3" "4" {
  clear
  local sheet_name = upper(substr("`file'",1,length("`file'")-4)) + " " + "`i'"
  
  capture noisily{
   import excel using "`file'", firstrow sheet("`sheet_name'")
   capture confirm file "rggvy_list_of_habs.dta"
   if _rc==0{
     append using rggvy_list_of_habs
   }
   save rggvy_list_of_habs, replace
  }
 }
}
}

*Clean and process variables
{
* Open the stacked spreadsheet import, show the row count per state, and
* remove exact duplicate rows
use "$loh/rggvy_list_of_habs.dta", clear
tabulate STATE_NAME
duplicates drop

* Replace the spreadsheet column headers with short lower-case names
rename (STATE_CD STATE_NAME DISTT_CD DISTT_NAME BLK_CD BLK_NAME VILL_CD VILL_NAME HAM_NAME) ///
       (st_code state dt_code district bk_code block vi_code village hab)

* Variable labels: codes first, then names
label variable st_code "state code"
label variable dt_code "district code"
label variable bk_code "block code"
label variable vi_code "village code"
label variable state "state name"
label variable district "district name"
label variable block "block name"
label variable village "village name"
label variable hab "habitation name"

* Convert the four code fields to numeric where they were read as text
destring st_code dt_code bk_code vi_code, replace

compress
save "$rggvy/rggvy_list_of_habs.dta", replace
}

* Reformat as an input to the fuzzy habitation merge algorithm
{
* Build the village-level name file used as input to the fuzzy habitation
* merge: clean the name fields, count habitations per village, attach the
* census code concordance, and save rggvy_list_of_habs_names.dta.

* Load the CLEANED habitation list. The cleaning step saves its renamed output
* under the rggvy folder; the copy in the loh folder is the raw spreadsheet
* import and still carries the original column names, so the renames below
* would fail on it.
use "$rggvy/rggvy_list_of_habs.dta", clear
rename state state_lh
rename district district_lh
rename block block_lh
rename village village_lh

* NOTE(review): the local macro char is never defined in this file, so it
* expands to an empty string; these four statements are kept as written and
* are presumably no-ops. Confirm which character was meant to be stripped.
replace state_lh = subinstr(state_lh,"`char'","",.)
replace district_lh = subinstr(district_lh,"`char'","",.)
replace block_lh = subinstr(block_lh,"`char'","",.)
replace village_lh = subinstr(village_lh,"`char'","",.)

* Take the last character of the district name in one specific observation
* and strip that character from every name field.
* NOTE(review): this relies on the hard-coded row 347389 and therefore on the
* row order of the input; presumably it targets a stray non-printing
* character. Verify after any change to the input files.
local char2 = substr(upper(trim(itrim(district_lh[347389]))),-1,1)
replace state_lh = subinstr(state_lh,"`char2'","",.)
replace district_lh = subinstr(district_lh,"`char2'","",.)
replace block_lh = subinstr(block_lh,"`char2'","",.)
replace village_lh = subinstr(village_lh,"`char2'","",.)

* District names: one manual correction, then standard normalisation
replace district_lh = "Senapati" if trim(district_lh)=="Senapati (Excluding 3 Sub-Divisions)" & dt_code==272
replace district_lh = upper(trim(itrim(subinstr(district_lh,"*","",.))))

* Block names: same punctuation and suffix clean-up, in the same order, as
* applied to the block names of the scraped RGGVY microdata
replace block_lh = subinstr(subinstr(subinstr(block_lh,"*","",.)," -"," ",.),"- "," ",.)
replace block_lh = subinstr(subinstr(subinstr(subinstr(block_lh,"( ","(",.)," )",")",.),"-I"," I",.),"-1"," I",.)
replace block_lh = subinstr(subinstr(subinstr(block_lh,"(T)","",.),"(S.T)","",.),"(M)","",.)
replace block_lh = subinstr(subinstr(subinstr(block_lh,"C.D.BLOCK","",.),"SUB-DIV.","",.),"SUB-DIVISION","",.)
replace block_lh = subinstr(subinstr(subinstr(block_lh,"(P)","",.),"CIRCLE","",.),"CD BLOCK","",.)
replace block_lh = upper(trim(itrim(block_lh)))

* Blank out village names that are exactly 244 characters long
* NOTE(review): 244 is presumably a maximum string length hit by bad
* spreadsheet cells - confirm
replace village_lh = "" if length(village_lh)==244
replace village_lh = upper(trim(itrim(village_lh)))
duplicates drop

* Collapse to one row per code/name combination; lh_hab_count becomes the
* number of rows (habitations) in each group. The by-list is a variable
* range, so it depends on the column order of the input.
gen lh_hab_count = _n
collapse (count) lh_hab_count, by(st_code-village_lh)

* These codes are matched to the 2011-suffixed keys of the concordance
rename vi_code vi_code11
rename st_code st_code11
rename dt_code dt_code11
rename bk_code bk_code11_lh
compress
duplicates drop
gen lh_id = _n

* Attach the census code concordance, keeping unmatched master rows.
* NOTE(review): an m:m merge pairs rows by position within each key group
* and is rarely what is intended; check whether the keys are unique on
* either side and whether joinby or m:1 is the correct operation.
merge m:m st_code11 dt_code11 vi_code11 using "$conc/census_code_matches_names.dta", keep(1 3) nogen
keep *lh* st_code dt_code bk_code_conc01 vi_code
* Full variable name used here so the rename does not rely on abbreviation
rename bk_code_conc01 bk_code_lh
drop state_lh
compress
duplicates drop 
save "$loh/rggvy_list_of_habs_names.dta", replace
}
}

********************************************************************************
********************************************************************************